library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0 ✔ purrr 0.2.5
## ✔ tibble 1.4.2 ✔ dplyr 0.7.8
## ✔ tidyr 0.8.2 ✔ stringr 1.3.1
## ✔ readr 1.3.0 ✔ forcats 0.3.0
## ── Conflicts ───────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
# Read all CSV response files in the "responses" folder
files <- list.files(path = "responses", pattern = "\\.csv$")
# Read each file into one element of a list.
# suppressWarnings(): the files end without a final newline, which
# read.table() warns about; the data are still read correctly.
datalist <- lapply(
  file.path("responses", files),
  function(x) suppressWarnings(read.table(x, header = FALSE))
)
# Determine the longest record so shorter ones can be padded with NA
max_length <- max(vapply(datalist, length, integer(1)))
# Pad every record to max_length so they can be row-bound later
df_filled <- lapply(datalist, function(x) {
  # length.out spelled out (was partially matched as `length=`);
  # seq_along() is safe even for a zero-length record (1:length(x) is not)
  padded <- rep(NA, length.out = max_length)
  padded[seq_along(x)] <- x
  padded
})
# Names for the fixed (first 15) fields of each response record
colnamelist <- c(
  "ID", "length", "response", "effect_size_abs", "effect_size",
  "true_mean1", "true_mean2", "obs_mean_1", "obs_mean2", "obs_mean_dif",
  "df", "tvalue", "pvalue", "obs_power", "d"
)
#combine lists into a dataframe, make numeric and rename columns
# One row per trial: the first 15 columns are the fixed fields named in
# colnamelist; columns 16+ hold the variable-length tail of each record,
# which is gathered long, stripped of the NA padding, and nested into a
# list column (one data frame of raw values per trial).
dat <- do.call(rbind, df_filled) %>%
as.data.frame() %>%
# ensure every column is numeric before renaming/reshaping
mutate_all(as.numeric) %>%
rename_at(1:15, ~ colnamelist) %>%
# deal with extra values on end of each line
gather(key, val, 16:ncol(.)) %>%
filter(!is.na(val)) %>%
select(-key) %>%
# group by the 15 fixed columns so nest() collapses only the raw values
group_by_at(vars(one_of(colnamelist))) %>%
nest()
# Derived variables: significance flag, recoded response direction, the
# correct answer implied by the true effect size, and correctness.
all_data <- dat %>%
  mutate(
    # 1 if the test reached significance at the .05 level, 0 otherwise
    significant = as.numeric(pvalue <= 0.05),
    # raw code 2 means a negative judgment; map it to -1 (0 and 1 unchanged)
    response = case_when(
      response == 2 ~ -1,
      TRUE ~ response
    ),
    # sign of the true effect: -1, 0, or 1
    right_answer = sign(effect_size),
    # 1 if the judged direction matches the true direction
    correct = as.numeric(response == right_answer)
  )
# Keep only trials where the participant saw more than 5 responses
all_data_sub <- all_data %>%
  filter(length > 5)
# calculate percents for each judgment as sig for each effect_size
# for use in plots below
# One row per (effect_size, response) with: pcnt = share of trials getting
# that judgment, sig = share of significant p-values within that judgment.
sub_pcnt <- all_data_sub %>%
group_by(right_answer, effect_size) %>%
summarise(
# proportion of trials judged "no effect" (z), positive (p), negative (n)
pcnt_z = mean(response == 0),
pcnt_p = mean(response == 1),
pcnt_n = mean(response == -1),
# share of significant results within each judgment
# NOTE(review): uses pvalue < .05 here, but `significant` above used
# pvalue <= 0.05 — confirm which cutoff is intended
sig_z = sum(response == 0 & pvalue < .05)/sum(response == 0),
sig_p = sum(response == 1 & pvalue < .05)/sum(response == 1),
sig_n = sum(response == -1 & pvalue < .05)/sum(response == -1)
) %>%
# reshape so pcnt and sig become columns keyed by (effect_size, response)
gather(key, val, pcnt_z:sig_n) %>%
separate(key, c("key", "response")) %>%
spread(key, val) %>%
mutate(
# map the z/p/n suffixes back to the numeric response codes
response = recode(response, "z" = 0, "p" = 1, "n" = -1),
pcnt = round(pcnt, 2),
# sig is NaN when a judgment never occurred (0/0 above); treat as 0
sig = round(ifelse(is.nan(sig), 0, sig), 3),
correct = right_answer == response
)
# Histogram of observed mean differences per (response, effect_size) cell.
# Background shading marks whether the judgment was correct, the red line
# marks the true effect size, and the text label is the share of trials
# with that judgment (pcnt from sub_pcnt).
ggplot() +
geom_rect(data = sub_pcnt,
aes(fill = correct),
xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = Inf,
alpha = 0.3, show.legend = FALSE) +
geom_vline(data = all_data_sub, aes(xintercept = effect_size), color = "red") +
geom_histogram(data = all_data_sub, aes(x=obs_mean_dif),
binwidth = 0.1, color = "black", fill = "white") +
geom_text(data = sub_pcnt, aes(label = pcnt), x = -1.5, y = 25) +
facet_grid(response~effect_size, labeller = label_both) +
theme_bw() +
scale_fill_manual(values = c("white", "grey50"))
# ggsave() with no plot argument saves the last plot displayed
ggsave("obs_mean_diff.png", width = 16, height = 10)
# Histogram of p-values per (response, effect_size) cell. Shading marks
# correct/incorrect cells; the label is the percent of trials in that cell
# reaching p < .05 (sig from sub_pcnt).
ggplot() +
geom_rect(data = sub_pcnt, aes(fill = correct),
xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = Inf,
alpha = 0.3, show.legend = FALSE) +
# boundary = 0 aligns the first bin edge with p = 0
geom_histogram(data = all_data_sub, aes(x=pvalue),
binwidth = 0.05, boundary = 0,
color = "black", fill = "white", alpha = 0.5) +
geom_text(data = sub_pcnt, aes(label = paste("%sig =", sig*100)), x = .2, y = 50) +
facet_grid(response ~ effect_size, labeller = label_both) +
theme_bw() +
scale_fill_manual(values = c("white", "grey50"))
ggsave("power.png", width = 16, height = 10)
(Trials with 5 or fewer responses were filtered out above, so only trials with more than 5 responses appear in the tables below.)
# Summary table: counts, mean effect sizes, and power for each combination
# of true effect size, judged direction, and correctness.
all_data_sub %>%
  group_by(effect_size, response, correct) %>%
  summarize(
    n = n(),
    # -d because d in the dataset is calculated in the opposite direction!
    mean_d = mean(-d, na.rm = TRUE),
    power = mean(pvalue <= 0.05),
    mean_obs_d = mean(obs_mean_dif, na.rm = TRUE),
    obs_power = mean(obs_power, na.rm = TRUE)
  ) %>%
  mutate_if(is.double, round, 3) %>%
  filter(n > 5) %>%
  knitr::kable()
| effect_size | response | correct | n | mean_d | power | mean_obs_d | obs_power |
|---|---|---|---|---|---|---|---|
| -0.8 | -1 | 1 | 62 | -0.898 | 0.919 | -0.910 | 0.816 |
| -0.5 | -1 | 1 | 48 | -0.700 | 0.917 | -0.678 | 0.736 |
| -0.2 | -1 | 1 | 20 | -0.366 | 0.400 | -0.359 | 0.448 |
| -0.2 | 0 | 0 | 25 | -0.113 | 0.040 | -0.114 | 0.136 |
| 0.0 | -1 | 0 | 30 | -0.218 | 0.100 | -0.213 | 0.280 |
| 0.0 | 0 | 1 | 101 | 0.006 | 0.010 | 0.006 | 0.115 |
| 0.0 | 1 | 0 | 31 | 0.284 | 0.161 | 0.280 | 0.328 |
| 0.2 | 0 | 0 | 22 | 0.106 | 0.136 | 0.108 | 0.212 |
| 0.2 | 1 | 1 | 25 | 0.435 | 0.400 | 0.409 | 0.486 |
| 0.5 | 0 | 0 | 13 | 0.358 | 0.538 | 0.365 | 0.423 |
| 0.5 | 1 | 1 | 46 | 0.632 | 0.848 | 0.636 | 0.709 |
| 0.8 | 1 | 1 | 58 | 0.988 | 0.966 | 0.954 | 0.848 |
(ignoring response)
# Summary table collapsed over response: per effect size, the share of
# correct judgments plus mean effect sizes and power.
all_data_sub %>%
  group_by(effect_size) %>%
  summarize(
    n = n(),
    correct_pcnt = mean(correct),
    # -d because d in the dataset is calculated in the opposite direction!
    mean_d = mean(-d, na.rm = TRUE),
    power = mean(pvalue <= 0.05),
    mean_obs_d = mean(obs_mean_dif, na.rm = TRUE),
    obs_power = mean(obs_power, na.rm = TRUE)
  ) %>%
  mutate_if(is.double, round, 3) %>%
  knitr::kable()
| effect_size | n | correct_pcnt | mean_d | power | mean_obs_d | obs_power |
|---|---|---|---|---|---|---|
| -0.8 | 70 | 0.886 | -0.910 | 0.914 | -0.920 | 0.816 |
| -0.5 | 54 | 0.889 | -0.674 | 0.870 | -0.655 | 0.712 |
| -0.2 | 47 | 0.426 | -0.233 | 0.213 | -0.231 | 0.279 |
| 0.0 | 162 | 0.623 | 0.018 | 0.056 | 0.018 | 0.187 |
| 0.2 | 49 | 0.510 | 0.278 | 0.265 | 0.266 | 0.353 |
| 0.5 | 60 | 0.767 | 0.566 | 0.767 | 0.571 | 0.639 |
| 0.8 | 58 | 1.000 | 0.988 | 0.966 | 0.954 | 0.848 |
(I’m not sure what’s going on here: the function below finds, for each response group, the shortest interval containing a given fraction of the values.)
# Find the shortest interval that contains (approximately) a fraction
# `alpha` of the values in `x` — a minimum-width coverage interval.
#
# x:     numeric vector of observations (NAs are dropped).
# alpha: fraction of the non-missing values the interval must cover.
# Returns a length-2 numeric vector c(lower, upper).
find_cover_region <- function(x, alpha=0.5) {
  # sort() drops NAs by default, so compute n AFTER sorting; the original
  # order (n first) produced out-of-range indexing when x contained NAs.
  x <- sort(x)
  n <- length(x)
  # k = number of observations allowed to fall outside the interval
  k <- as.integer(round((1-alpha) * n))
  # Candidate intervals span n - k consecutive order statistics; pick the
  # start index i giving the narrowest one.
  i <- which.min(x[seq.int(n-k, n)] - x[seq_len(k+1L)])
  c(x[i], x[n-k+i-1L])
}
# Shortest interval containing 50% (the default alpha) of the sample d
# values, split by response (-d because d in the dataset is calculated in
# the opposite direction)
tapply(-all_data_sub$d, all_data_sub$response, find_cover_region)
## $`-1`
## [1] -0.8249109 -0.3589261
##
## $`0`
## [1] -0.17304226 0.05205021
##
## $`1`
## [1] 0.3677225 0.8761518
# Same 50% coverage interval for the observed mean differences
tapply(all_data_sub$obs_mean_dif, all_data_sub$response, find_cover_region)
## $`-1`
## [1] -0.8770313 -0.4127584
##
## $`0`
## [1] -0.16894816 0.05487738
##
## $`1`
## [1] 0.2943237 0.7974972
Calculate cumulative means at each observation for each trial.
# Unpack the nested raw stream into one row per observation pair and add a
# running mean of the pairwise differences.
cummeans <- all_data_sub %>%
mutate(trial = row_number()) %>%
unnest() %>%
# values of exactly 1 or 2 are treated as group markers, everything else
# as observation values — the stream apparently alternates marker/value.
# NOTE(review): a measurement equal to exactly 1 or 2 would be
# misclassified here; confirm the raw values can never hit those exactly.
mutate(key = ifelse(val %in% c(1, 2), "grp", "val")) %>%
group_by(trial, key) %>%
mutate(obs_n = row_number()) %>%
ungroup() %>%
# pair each group marker with its observation value
spread(key, val) %>%
# obs_n counted markers and values separately, so halve it; groups 1/2
# become A/B
mutate(obs_n = ceiling(obs_n/2),
grp = recode(grp, "1" = "A", "2" = "B")) %>%
# one row per observation pair with columns A and B
spread(grp, val) %>%
mutate(val = B-A) %>%
group_by(trial) %>%
# running mean of the B - A differences after obs_n pairs
mutate(cummean = cumsum(val) / obs_n) %>%
ungroup()
Plot cumulative means at each observation for effect size by response.
# Cumulative-mean trajectory of each trial, faceted by effect size (rows)
# and response (columns); the grey line marks the true effect size.
cummeans %>%
ggplot(aes(obs_n, cummean, color = as.factor(trial))) +
geom_hline(aes(yintercept = effect_size), color = "grey50") +
geom_line(show.legend = FALSE) +
facet_grid(effect_size~response, labeller = label_both)
## Warning: Removed 295 rows containing missing values (geom_path).
ggsave("cummean.png", width = 10, height = 10)
## Warning: Removed 295 rows containing missing values (geom_path).
# Same trajectories, now binned by decision length (number of observation
# pairs, length/2, rounded up to the nearest 10) and colored by response.
binwidth <- 10
cummeans %>%
mutate(length_bin = ceiling(length/2/binwidth) * binwidth) %>%
ggplot(aes(obs_n, cummean, group = as.factor(trial), color = as.factor(response))) +
geom_hline(aes(yintercept = effect_size)) +
geom_line(show.legend = FALSE) +
facet_grid(effect_size~length_bin, labeller = label_both, scales = "free_x") +
# semi-transparent blue / black / red for the three response levels
scale_color_manual(values = c(alpha("blue", .3),
alpha("black", .3),
alpha("red", .3))) +
scale_x_continuous(breaks = seq(0, 150, by = binwidth)) +
# clip the y axis without dropping data from the stat computations
coord_cartesian(ylim = c(-2, 2)) +
ggtitle("Cumulative means for decision length bins")
## Warning: Removed 295 rows containing missing values (geom_path).
ggsave("cummean_binned.png", width = 20, height = 15)
## Warning: Removed 295 rows containing missing values (geom_path).
# As above, restricted to the null (effect_size == 0) trials and faceted
# by response instead of effect size.
binwidth <- 10
cummeans %>%
filter(effect_size == 0) %>%
mutate(length_bin = ceiling(length/2/binwidth) * binwidth) %>%
ggplot(aes(obs_n, cummean, group = as.factor(trial), color = as.factor(response))) +
geom_hline(aes(yintercept = effect_size)) +
geom_line(show.legend = FALSE) +
facet_grid(response~length_bin, labeller = label_both, scales = "free_x") +
scale_color_manual(values = c(alpha("blue", .3),
alpha("black", .3),
alpha("red", .3))) +
scale_x_continuous(breaks = seq(0, 150, by = binwidth)) +
coord_cartesian(ylim = c(-2, 2)) +
ggtitle("Cumulative means for decision length bins where effect size = 0")
## Warning: Removed 97 rows containing missing values (geom_path).
ggsave("cummean_binned_0.png", width = 20, height = 15)
## Warning: Removed 97 rows containing missing values (geom_path).
Some model comparison of predictors of correct responses. “after_N” variables are how close the cumulative mean is to the effect size after 10, 20, 30, etc observation pairs. This is probably not a great analysis strategy.
# One row per trial with "after_N" predictors: the absolute distance of the
# cumulative mean from the true effect size after 10, 20, ..., maxlen
# observation pairs. Only trials with at least maxlen pairs are kept.
# NOTE(review): this reassigns `dat`, shadowing the nested data frame built
# near the top of the script — consider a distinct name.
maxlen <- 50
dat <- cummeans %>%
filter(length/2 >= maxlen, obs_n %in% seq(10, maxlen, by = 10)) %>%
mutate(obs_n = paste0("after_", obs_n),
close = abs(cummean - effect_size)) %>%
select(trial, effect_size, effect_size_abs, correct, obs_n, close) %>%
spread(obs_n, close)
# Logistic regressions: does closeness of the cumulative mean to the true
# effect size after N observation pairs predict a correct response,
# controlling for absolute effect size? One model per N.

# After 10 observation pairs
mod10 <- glm(correct ~ effect_size_abs + after_10, family = binomial, data = dat)
summary(mod10)
##
## Call:
## glm(formula = correct ~ effect_size_abs + after_10, family = binomial,
## data = dat)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5538 -1.3335 0.9106 1.0296 1.0339
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.34721 0.26811 1.295 0.195
## effect_size_abs 0.62892 0.66069 0.952 0.341
## after_10 0.02099 0.54800 0.038 0.969
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 289.67 on 216 degrees of freedom
## Residual deviance: 288.74 on 214 degrees of freedom
## AIC: 294.74
##
## Number of Fisher Scoring iterations: 4
# After 20 observation pairs
mod20 <- glm(correct ~ effect_size_abs + after_20, family = binomial, data = dat)
summary(mod20)
##
## Call:
## glm(formula = correct ~ effect_size_abs + after_20, family = binomial,
## data = dat)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6020 -1.3215 0.9020 0.9921 1.3348
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.6582 0.2617 2.516 0.0119 *
## effect_size_abs 0.6109 0.6644 0.919 0.3579
## after_20 -1.2130 0.7640 -1.588 0.1124
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 289.67 on 216 degrees of freedom
## Residual deviance: 286.20 on 214 degrees of freedom
## AIC: 292.2
##
## Number of Fisher Scoring iterations: 4
# After 30 observation pairs
mod30 <- glm(correct ~ effect_size_abs + after_30, family = binomial, data = dat)
summary(mod30)
##
## Call:
## glm(formula = correct ~ effect_size_abs + after_30, family = binomial,
## data = dat)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6424 -1.2585 0.8302 0.9518 1.3886
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.9335 0.2738 3.409 0.000651 ***
## effect_size_abs 0.6600 0.6747 0.978 0.327968
## after_30 -3.0202 1.0622 -2.843 0.004463 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 289.67 on 216 degrees of freedom
## Residual deviance: 280.32 on 214 degrees of freedom
## AIC: 286.32
##
## Number of Fisher Scoring iterations: 4
# After 40 observation pairs
mod40 <- glm(correct ~ effect_size_abs + after_40, family = binomial, data = dat)
summary(mod40)
##
## Call:
## glm(formula = correct ~ effect_size_abs + after_40, family = binomial,
## data = dat)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7025 -1.2735 0.7998 0.9679 1.5374
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.0038 0.2785 3.604 0.000314 ***
## effect_size_abs 0.6876 0.6781 1.014 0.310553
## after_40 -3.8297 1.2396 -3.089 0.002005 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 289.67 on 216 degrees of freedom
## Residual deviance: 278.48 on 214 degrees of freedom
## AIC: 284.48
##
## Number of Fisher Scoring iterations: 4
# After 50 observation pairs
mod50 <- glm(correct ~ effect_size_abs + after_50, family = binomial, data = dat)
summary(mod50)
##
## Call:
## glm(formula = correct ~ effect_size_abs + after_50, family = binomial,
## data = dat)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7047 -1.2604 0.8367 0.9672 1.6472
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.9085 0.2669 3.405 0.000663 ***
## effect_size_abs 0.6601 0.6715 0.983 0.325535
## after_50 -3.6915 1.3021 -2.835 0.004581 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 289.67 on 216 degrees of freedom
## Residual deviance: 280.39 on 214 degrees of freedom
## AIC: 286.39
##
## Number of Fisher Scoring iterations: 4
# compare models using AIC
library(AICcmodavg)
# Candidate models built in one step (instead of growing the list
# element-by-element); names are the number of observation pairs used.
models <- list(mod10, mod20, mod30, mod40, mod50)
modelnames <- seq(10, maxlen, by = 10)
atab <- aictab(models, modelnames)
atab
##
## Model selection based on AICc:
##
## K AICc Delta_AICc AICcWt Cum.Wt LL
## 40 3 284.59 0.00 0.55 0.55 -139.24
## 30 3 286.43 1.85 0.22 0.77 -140.16
## 50 3 286.50 1.91 0.21 0.99 -140.19
## 20 3 292.32 7.73 0.01 1.00 -143.10
## 10 3 294.85 10.26 0.00 1.00 -144.37